#!/usr/bin/env python3
import json, math, numpy as np, pandas as pd, re

# Input table of stacks; main() reads its "claimable", "A_theta",
# "Mstar_bin" and "R_G_bin" columns.
IN_CSV  = "outputs/lensing_plateau.csv"
# Destination of the per-mass-bin slope summary that main() writes as JSON.
OUT_JSON= "outputs/size_regression.json"

# "<number> - <number>" anchored at the start of the label; trailing text
# (e.g. units) after the second number is tolerated.  Compiled once because
# rg_mid is applied to every row.
_RG_RANGE_RE = re.compile(r"\s*([0-9.]+)\s*-\s*([0-9.]+)\s*")


def rg_mid(label: str) -> float:
    """Return the midpoint of a range label such as ``"1.5-3.0"``.

    Em and en dashes are accepted as the separator.  Anything that cannot be
    read as two numbers joined by a dash -- a non-string value, a label with
    no range at its start, or number-like tokens that ``float`` rejects such
    as ``".."`` or ``"1.2.3"`` -- yields NaN instead of raising, so a single
    malformed label cannot abort a column-wide ``apply``.

    Args:
        label: The bin label to parse.

    Returns:
        ``0.5 * (lo + hi)`` for a parsable label, otherwise ``nan``.
    """
    if not isinstance(label, str):
        return float("nan")

    # Normalise typographic dashes to the ASCII hyphen the pattern expects.
    text = label.strip().replace("—", "-").replace("–", "-")
    match = _RG_RANGE_RE.match(text)
    if match is None:
        return float("nan")

    try:
        lo, hi = float(match.group(1)), float(match.group(2))
    except ValueError:
        # The character class [0-9.]+ also matches non-numbers like "..".
        return float("nan")
    return 0.5 * (lo + hi)

def main():
    """Fit A_theta against R_G bin midpoint per Mstar bin and write a JSON summary.

    Reads ``IN_CSV``, keeps rows whose ``claimable`` value is "true"
    (case-insensitive) and whose ``A_theta`` is a finite number, converts each
    ``R_G_bin`` label to its midpoint with ``rg_mid`` and, for every
    ``Mstar_bin`` group, stores in ``OUT_JSON``:

    * ``n_stacks`` -- number of usable rows in the group,
    * ``slope_Atheta_vs_RG`` -- slope of a degree-1 least-squares fit,
    * ``CI_16`` / ``CI_84`` -- 16th and 84th percentiles of the slope over
      2000 bootstrap resamples (generator seeded with 42).

    Groups with fewer than three rows, or whose rows all share a single R_G
    midpoint, have no defined slope and are reported with NaN values.  If no
    row passes the initial filter an empty JSON object is written.

    NOTE: NaN values are emitted by ``json.dump`` as the bare token ``NaN``,
    which Python's ``json`` reads back but strict JSON parsers may reject.
    """
    df = pd.read_csv(IN_CSV)

    # Keep certified stacks with a finite A_theta.  isfinite (rather than
    # notna) also rejects +/-inf, which would otherwise poison the fit.
    claimable = df["claimable"].astype(str).str.strip().str.lower() == "true"
    a_theta = pd.to_numeric(df["A_theta"], errors="coerce")
    ok = claimable & np.isfinite(a_theta)

    d = df.loc[ok, ["Mstar_bin", "R_G_bin"]].copy()
    # Carry the coerced numeric values forward instead of the raw column.
    d["A_theta"] = a_theta[ok]

    if d.empty:
        with open(OUT_JSON, "w") as f:
            json.dump({}, f, indent=2)
        print("No claimable stacks with A_theta; wrote empty size_regression.json")
        return

    d["RG_mid"] = d["R_G_bin"].apply(rg_mid)
    d = d[np.isfinite(d["RG_mid"])]

    out = {}
    rng = np.random.default_rng(42)
    B = 2000
    nan = float("nan")

    for ms, g in d.groupby("Mstar_bin"):
        x = g["RG_mid"].to_numpy(dtype=float)
        y = g["A_theta"].to_numpy(dtype=float)
        n = len(g)

        # A line needs at least two distinct x values; with a single one the
        # least-squares problem is rank deficient and the "slope" polyfit
        # returns is meaningless.
        if n < 3 or x.min() == x.max():
            out[ms] = {"n_stacks": int(n), "slope_Atheta_vs_RG": nan,
                       "CI_16": nan, "CI_84": nan}
            continue

        slope = np.polyfit(x, y, 1)[0]

        # Bootstrap CI.  Resamples that happen to draw one distinct x are
        # rank deficient as well; leave them NaN so they are ignored by the
        # percentile instead of contaminating it.
        boots = np.full(B, np.nan, dtype=float)
        for i in range(B):
            idx = rng.integers(0, n, size=n)
            xb = x[idx]
            if xb.min() == xb.max():
                continue
            boots[i] = np.polyfit(xb, y[idx], 1)[0]
        lo, hi = np.nanpercentile(boots, [16, 84])

        out[ms] = {"n_stacks": int(n), "slope_Atheta_vs_RG": float(slope),
                   "CI_16": float(lo), "CI_84": float(hi)}

    with open(OUT_JSON, "w") as f:
        json.dump(out, f, indent=2)
    print(f"Wrote {OUT_JSON} with {len(out)} mass bins.")

# Run the regression only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
